{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pickle\n",
    "from scipy import sparse\n",
    "from sklearn.preprocessing import OneHotEncoder,LabelEncoder\n",
    "from sklearn.model_selection import KFold, cross_val_score, train_test_split\n",
    "from sklearn.metrics import roc_auc_score, log_loss\n",
    "import lightgbm as lgb\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 读取特征矩阵\n",
    "with open('../data/temp.pkl', 'rb') as file:\n",
    "    data = pickle.load(file)\n",
    "\n",
    "### 分成不同的特征部分\n",
    "is_feature = ['creative_is_download', 'creative_is_jump', 'creative_has_deeplink']\n",
    "rate_feature = [i for i in data.columns if '_rate' in i]\n",
    "cate_feature = ['f_channel',  'adid', 'inner_slot_id', 'os', 'advert_industry_inner', 'clear_model', 'devtype', 'clear_osv', \\\n",
    "                'creative_height', 'creative_type', 'sim_ip', 'advert_name', 'orderid', 'carrier', 'app_id', 'creative_id', \\\n",
    "                'creative_tp_dnf','app_cate_id', 'creative_width', 'clear_make', 'province', 'nnt', 'city', 'campaign_id'] + is_feature\n",
    "ignore_feature = ['instance_id', 'click', 'period']\n",
    "num_feature = [i for i in data.columns if '_num' in i] + ['hour'] + rate_feature"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 为了测量结果，使用Logloss衡量，且将训练集划分为：训练集+测试集+验证集(按照时间划分)\n",
    "### period小于33的为训练集，period=33随即划分为验证集+测试集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "total_train = data[data.click != -1]\n",
    "train = total_train[total_train.period <= 32][num_feature + cate_feature+ignore_feature]\n",
    "train_y = total_train[total_train.period <= 32]['click'] ##标签\n",
    "\n",
    "val_and_test = total_train[total_train.period == 33][num_feature + cate_feature+ignore_feature]\n",
    "val_and_test_y = total_train[total_train.period == 33]['click']\n",
    "val, test, val_y, test_y = train_test_split(val_and_test, val_and_test_y, test_size=0.5, random_state=1024)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 用gbdt训练类别型变量，得到叶子节点拼接类别型，最后使用LR模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#用gbdt训练类别型变量，得到叶子节点拼接类别型，最后使用LR模型\n",
    "# 模型部分\n",
    "lgb_clf = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=48, max_depth=-1, learning_rate=0.05, n_estimators=350,\n",
    "                           max_bin=425, subsample_for_bin=50000, objective='binary', min_split_gain=0,\n",
    "                           min_child_weight=5, min_child_samples=150, subsample=0.8, subsample_freq=1,\n",
    "                           colsample_bytree=1, reg_alpha=3, reg_lambda=5, seed=1000, n_jobs=10, silent=True)\n",
    "\n",
    "train_csr = train[num_feature]\n",
    "val_csr = val[num_feature]\n",
    "test_csr = test[num_feature]\n",
    "train_y = train_y.values\n",
    "\n",
    "#只提取最后100维数据\n",
    "lgb_clf.fit(train_csr, train_y)\n",
    "new_feature_train = lgb_clf.apply(train_csr)[:,-100:]   \n",
    "new_feature_val = lgb_clf.apply(val_csr)[:,-100:]\n",
    "new_feature_test = lgb_clf.apply(test_csr)[:,-100:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "### 重命名GBDT的结果\n",
    "for i in range(new_feature_train.shape[1]):\n",
    "    train['gbdt_'+str(i)] = new_feature_train[:, i]  \n",
    "    val['gbdt_'+str(i)] = new_feature_val[:, i]  \n",
    "    test['gbdt_'+str(i)] = new_feature_test[:, i]  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 下面的视情况而定是否需要，也就是存储的全为离散值的特征矩阵（将数值特征转为类别特征）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ###将结果拼接起来，并用pickle存储\n",
    "# # train['click'], val['click'], test['click'] = train_y, val_y.values, test_y.values\n",
    "# data = pd.concat((pd.concat((train, val), axis = 0), test), axis = 0)\n",
    "\n",
    "# import pickle\n",
    "# ##存储中间特征矩阵便于再次访问\n",
    "# with open('../data/GBDT_LR_feature.pkl', 'wb') as file:\n",
    "#     pickle.dump(data, file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "###拼接GBDT的结果的新的类别变量\n",
    "cate_feature = cate_feature + [i for i in data.columns if 'gbdt_' in i]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### CTR预估常用方法，转换为One-hot高维稀疏数据，为了节省内存，使用CSR矩阵存储"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "one-hot prepared !\n"
     ]
    }
   ],
   "source": [
    "total_data = pd.concat((train, val, test), axis = 0)\n",
    "base_train_csr = sparse.csr_matrix((len(train), 0))\n",
    "base_val_csr = sparse.csr_matrix((len(val), 0))\n",
    "base_test_csr = sparse.csr_matrix((len(test), 0))\n",
    "\n",
    "enc = OneHotEncoder()\n",
    "for feature in cate_feature:\n",
    "    enc.fit(total_data[feature].values.reshape(-1, 1))\n",
    "    base_train_csr = sparse.hstack((base_train_csr, enc.transform(train[feature].values.reshape(-1, 1))), 'csr', 'bool')\n",
    "    base_val_csr = sparse.hstack((base_val_csr, enc.transform(val[feature].values.reshape(-1, 1))),'csr', 'bool')\n",
    "    base_test_csr = sparse.hstack((base_test_csr, enc.transform(test[feature].values.reshape(-1, 1))),'csr', 'bool')\n",
    "print('one-hot prepared !')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### LR调参过程"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "训练集shape (844439, 6751) 验证集shape (78605, 6751) 测试集shape (78606, 6751)\n",
      "0.05\n",
      "[LibLinear]得到epcoh参数的过程loss 0.42878968705618953\n",
      "\n",
      "\n",
      "0.1\n",
      "[LibLinear]得到epcoh参数的过程loss 0.4287613127930187\n",
      "\n",
      "\n",
      "0.001\n",
      "[LibLinear]得到epcoh参数的过程loss 0.4332894429289333\n",
      "\n",
      "\n",
      "0.01\n",
      "[LibLinear]得到epcoh参数的过程loss 0.4292464932493843\n",
      "\n",
      "\n",
      "0.2\n",
      "[LibLinear]得到epcoh参数的过程loss 0.4287804222512716\n",
      "\n",
      "\n",
      "0.005\n",
      "[LibLinear]得到epcoh参数的过程loss 0.42979633676820933\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#调参C\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "print('训练集shape', base_train_csr.shape, '验证集shape', base_val_csr.shape, '测试集shape', base_test_csr.shape)\n",
    "#使用验证集调参\n",
    "for c in [0.05, 0.1, 0.001, 0.01, 0.2, 0.005]:\n",
    "    print(c)\n",
    "    model = LogisticRegression(C = c, verbose = 10)#C = 5\n",
    "    model.fit(base_train_csr, train_y)\n",
    "    \n",
    "    train_pred= model.predict_proba(base_val_csr)[:, 1]\n",
    "    print('得到epcoh参数的过程loss', log_loss(val_y, train_pred))\n",
    "    print('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[LibLinear]测试集Logloss  0.4254597101770659\n"
     ]
    }
   ],
   "source": [
    "### 用上面最好的参数c=0.1来训练模型并做验证\n",
    "model = LogisticRegression(C = 0.1, verbose = 10)#C = 5\n",
    "model.fit(base_train_csr, train_y)\n",
    "\n",
    "train_pred= model.predict_proba(base_test_csr)[:, 1]\n",
    "print('测试集Logloss ', log_loss(test_y, train_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
